{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "cdee651e-d8a9-483c-9982-97d714ee86fa",
   "metadata": {},
   "outputs": [],
   "source": [
    "!pip install -U pymilvus\n",
    "!pip install -U 'pymilvus[model]'\n",
    "!pip install -U scikit-learn"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "fa7ba260-f935-4286-a8f4-7010f8c23547",
   "metadata": {},
   "outputs": [],
   "source": [
    "docs = [\n",
    "    \"Artificial intelligence was founded as an academic discipline in 1956.\",\n",
    "    \"Alan Turing was the first person to conduct substantial research in AI.\",\n",
    "    \"Born in Maida Vale, London, Turing was raised in southern England.\",\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "b7997280-befd-4d1b-84d7-71113765b87c",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/Users/zilliz/.pyenv/lib/python3.9/site-packages/urllib3/__init__.py:34: NotOpenSSLWarning: urllib3 v2 only supports OpenSSL 1.1.1+, currently the 'ssl' module is compiled with 'LibreSSL 2.8.3'. See: https://github.com/urllib3/urllib3/issues/3020\n",
      "  warnings.warn(\n"
     ]
    }
   ],
   "source": [
    "import random\n",
    "import string\n",
    "import numpy as np\n",
    "\n",
    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
    "\n",
    "from pymilvus import (\n",
    "    utility,\n",
    "    FieldSchema, CollectionSchema, DataType,\n",
    "    Collection, AnnSearchRequest, RRFRanker, connections,\n",
    ")\n",
    "\n",
    "from pymilvus.model.hybrid import BGEM3EmbeddingFunction"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "25fba354-b43b-4333-9ab9-7e3221f57bce",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<3x27 sparse matrix of type '<class 'numpy.float64'>'\n",
       "\twith 32 stored elements in Compressed Sparse Row format>"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "vectorizer = TfidfVectorizer()\n",
    "X = vectorizer.fit_transform(docs)\n",
    "X"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "57c990ce-8b91-441c-b8e4-b990555f4634",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['1956' 'academic' 'ai' 'alan' 'an' 'artificial' 'as' 'born' 'conduct'\n",
      " 'discipline' 'england' 'first' 'founded' 'in' 'intelligence' 'london'\n",
      " 'maida' 'person' 'raised' 'research' 'southern' 'substantial' 'the' 'to'\n",
      " 'turing' 'vale' 'was']\n",
      "[[0.33907746 0.33907746 0.         0.         0.33907746 0.33907746\n",
      "  0.33907746 0.         0.         0.33907746 0.         0.\n",
      "  0.33907746 0.20026461 0.33907746 0.         0.         0.\n",
      "  0.         0.         0.         0.         0.         0.\n",
      "  0.         0.         0.20026461]\n",
      " [0.         0.         0.3119513  0.3119513  0.         0.\n",
      "  0.         0.         0.3119513  0.         0.         0.3119513\n",
      "  0.         0.18424347 0.         0.         0.         0.3119513\n",
      "  0.         0.3119513  0.         0.3119513  0.3119513  0.3119513\n",
      "  0.23724701 0.         0.18424347]\n",
      " [0.         0.         0.         0.         0.         0.\n",
      "  0.         0.32751633 0.         0.         0.32751633 0.\n",
      "  0.         0.38687284 0.         0.32751633 0.32751633 0.\n",
      "  0.32751633 0.         0.32751633 0.         0.         0.\n",
      "  0.24908461 0.32751633 0.19343642]]\n"
     ]
    }
   ],
   "source": [
    "print(vectorizer.get_feature_names_out())\n",
    "print(X.todense())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "ae294dbf-ddcb-4278-8a5c-52c209795253",
   "metadata": {},
   "outputs": [],
   "source": [
    "query = \"Who started AI research?\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "9c34142e-15dc-4ee1-9ed2-f869d09ce784",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'2.4.0'"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import pymilvus\n",
    "pymilvus.__version__"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "40774ae0-f6ac-4e64-af37-429bb8c424de",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/Users/zilliz/.pyenv/lib/python3.9/site-packages/transformers/utils/generic.py:441: UserWarning: torch.utils._pytree._register_pytree_node is deprecated. Please use torch.utils._pytree.register_pytree_node instead.\n",
      "  _torch_pytree._register_pytree_node(\n",
      "/Users/zilliz/.pyenv/lib/python3.9/site-packages/transformers/utils/generic.py:309: UserWarning: torch.utils._pytree._register_pytree_node is deprecated. Please use torch.utils._pytree.register_pytree_node instead.\n",
      "  _torch_pytree._register_pytree_node(\n",
      "/Users/zilliz/.pyenv/lib/python3.9/site-packages/transformers/utils/generic.py:309: UserWarning: torch.utils._pytree._register_pytree_node is deprecated. Please use torch.utils._pytree.register_pytree_node instead.\n",
      "  _torch_pytree._register_pytree_node(\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "4874a777ee8446d08092419cb2193f77",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Fetching 23 files:   0%|          | 0/23 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "ef = BGEM3EmbeddingFunction(use_fp16=False, device=\"cpu\")\n",
    "dense_dim = ef.dim[\"dense\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "2338cf19-cd69-4ca3-9061-85d2aad12554",
   "metadata": {},
   "outputs": [],
   "source": [
    "docs_embeddings = ef(docs)\n",
    "query_embeddings = ef([query])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "09fb6332-50ba-4be1-ad05-34c4bfc4d72c",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'dense': [array([-0.02505933, -0.00142187,  0.04015452, ..., -0.0209493 ,\n",
       "          0.02623649,  0.00324106], dtype=float32),\n",
       "  array([ 0.00118473,  0.00649282, -0.0073576 , ..., -0.01446302,\n",
       "          0.04243686, -0.01794804], dtype=float32),\n",
       "  array([ 0.00415292, -0.01014923,  0.00098095, ..., -0.02559674,\n",
       "          0.08084681,  0.00141655], dtype=float32)],\n",
       " 'sparse': <3x250002 sparse array of type '<class 'numpy.float32'>'\n",
       " \twith 43 stored elements in Compressed Sparse Row format>}"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "docs_embeddings"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "fa4c8845-75ef-4a93-bb3f-e6af5cb9b1ee",
   "metadata": {},
   "outputs": [],
   "source": [
    "connections.connect(\"default\", host=\"localhost\", port=\"19530\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "id": "ab2d9a03-7003-46be-a096-d478ce0b9265",
   "metadata": {},
   "outputs": [],
   "source": [
    "fields = [\n",
    "    # Use auto generated id as primary key\n",
    "    FieldSchema(name=\"pk\", dtype=DataType.VARCHAR,\n",
    "                is_primary=True, auto_id=True, max_length=100),\n",
    "    FieldSchema(name=\"text\", dtype=DataType.VARCHAR, max_length=512),\n",
    "    FieldSchema(name=\"sparse_vector\", dtype=DataType.SPARSE_FLOAT_VECTOR),\n",
    "    FieldSchema(name=\"dense_vector\", dtype=DataType.FLOAT_VECTOR,\n",
    "                dim=dense_dim)\n",
    "]\n",
    "schema = CollectionSchema(fields, \"\")\n",
    "col = Collection(\"sparse_dense_demo\", schema)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "id": "c58926c8-b45d-4863-9ead-fee4526c32e9",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Status(code=0, message=)"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "sparse_index = {\"index_type\": \"SPARSE_INVERTED_INDEX\", \"metric_type\": \"IP\"}\n",
    "dense_index = {\"index_type\": \"FLAT\", \"metric_type\": \"COSINE\"}\n",
    "col.create_index(\"sparse_vector\", sparse_index)\n",
    "col.create_index(\"dense_vector\", dense_index)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "id": "e2a463ec-6440-4f8b-b284-5826627ff160",
   "metadata": {},
   "outputs": [],
   "source": [
    "entities = [docs, docs_embeddings[\"sparse\"], docs_embeddings[\"dense\"]]\n",
    "col.insert(entities)\n",
    "col.flush()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "id": "68f54aa1-5a74-400d-a244-e39dd69858db",
   "metadata": {},
   "outputs": [],
   "source": [
    "col.load()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "id": "d0ef6704-667b-4a0b-af60-d61fbbec964e",
   "metadata": {},
   "outputs": [],
   "source": [
    "sparse_search_params = {\"metric_type\": \"IP\"}\n",
    "sparse_req = AnnSearchRequest(query_embeddings[\"sparse\"],\n",
    "                              \"sparse_vector\", sparse_search_params, limit=2)\n",
    "dense_search_params = {\"metric_type\": \"COSINE\"}\n",
    "dense_req = AnnSearchRequest(query_embeddings[\"dense\"],\n",
    "                             \"dense_vector\", dense_search_params, limit=2)\n",
    "\n",
    "res = col.hybrid_search([sparse_req, dense_req], rerank=RRFRanker(),\n",
    "                        limit=2, output_fields=[\"text\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "id": "b924c04a-b92e-4fe7-9837-5649a05e950f",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['[\"id: 448695871357611268, distance: 0.032786883413791656, entity: {\\'text\\': \\'Alan Turing was the first person to conduct substantial research in AI.\\'}\", \"id: 448695871357611267, distance: 0.016129031777381897, entity: {\\'text\\': \\'Artificial intelligence was founded as an academic discipline in 1956.\\'}\"]']"
      ]
     },
     "execution_count": 25,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "res"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "394f17a5-8856-4058-847e-fa079742513f",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
